In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import pandas as pd
In [159]:
df = pd.read_csv('srep00196-s2.csv',header='infer') #read data
In [160]:
df.head()
Out[160]:
food1 food2 cnt
0 black_sesame_seed rose_wine 3
1 fennel wild_berry 5
2 comte_cheese grape 57
3 nira raw_beef 1
4 corn_mint_oil parsnip_fruit 2
In [161]:
df = pd.DataFrame(df)
In [162]:
df['food'] = df['food1'].map(str) + df['food2']
In [163]:
documents = list(df.food)
In [167]:
for i in range(len(documents)):
    documents[i] = documents[i].replace('_',' ')
In [168]:
documents[:10]
Out[168]:
['black sesame seedrose wine',
 'fennelwild berry',
 'comte cheesegrape',
 'niraraw beef',
 'corn mint oilparsnip fruit',
 'soybeanvegetable oil',
 'bell peppernaranjilla',
 'chervilcrayfish',
 'corn mintcream cheese',
 'european cranberrythai pepper']
In [174]:
documents = documents[:500]  #memory issue for my laptop
len(documents)
Out[174]:
500
In [170]:
documents[:10]
Out[170]:
['black sesame seedrose wine',
 'fennelwild berry',
 'comte cheesegrape',
 'niraraw beef',
 'corn mint oilparsnip fruit',
 'soybeanvegetable oil',
 'bell peppernaranjilla',
 'chervilcrayfish',
 'corn mintcream cheese',
 'european cranberrythai pepper']
In [175]:
true_k = 5
vectorizer = TfidfVectorizer(order_centroids, stop_words='english')
X = vectorizer.fit_transform(documents)
In [176]:
model = KMeans(n_clusters=true_k)
model.fit(X)
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print "Cluster %d:" % i,
    for ind in order_centroids[i, :5]:
        print ' %s' % terms[ind],
    print
Top terms per cluster:
Cluster 0:  pork  bean  black  tea  fish
Cluster 1:  bread  codwheaten  soybeanwhite  katsuobushiwheat  cocoawhite
Cluster 2:  cheese  blue  dried  black  lobsterswiss
Cluster 3:  beef  niraraw  caucasroasted  baconraw  brothgrapefruit
Cluster 4:  oil  peel  grass  leaf  palustre
In [ ]:
%matplotlib inline
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

from sklearn.metrics.pairwise import cosine_similarity

To select the best K, we start from 2 to 7 and report the sum of distances and clusters.

In [198]:
ks = []
distance = []

for true_k in range(2,15):
    vectorizer = TfidfVectorizer(order_centroids, stop_words='english')
    X = vectorizer.fit_transform(documents)

    model = KMeans(n_clusters=true_k)
    model.fit(X)
    
    print ("Number of clusters : %d" % true_k)
    print ("Sum of distances: %d " % model.inertia_)
    
    ks.append(true_k)
    distance.append(model.inertia_)
    
    print("Top terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print "Cluster %d:" % i,
        for ind in order_centroids[i, :5]:
            print ' %s' % terms[ind],
        print
    
    dist = 1 - cosine_similarity(X)

    MDS()

    # convert two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

    pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

    xs, ys = pos[:, 0], pos[:, 1]


    #set up colors per clusters using a dict
    cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a',4: '#66a61e', 5: '#E24A33',6:'#C4AD66',
                     7:'#fa8174', 8:'#E8000B', 9: '#B0E0E6', 10: '#7A68A6', 11: '#ccebc4', 12: '#4878CF',13:'#03ED3A'}

    #set up cluster names using a dict
    cluster_names = {0: 'cluster 1', 
                 1: 'cluster 2', 
                 2: 'cluster 3', 
                 3: 'cluster 4',
                 4: 'cluster 5',
                5: 'cluster 6',
                6:'cluster 7',
                    7: 'cluster 8',
                    8: 'cluster 9',
                    9: 'cluster 10',
                    10: 'cluster 11',
                    11: 'cluster 12',
                    12: 'cluster 13',
                    13: 'cluster 14'}


    clusters = model.labels_.tolist()

    #create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=documents)) 

    #create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=documents)) 


    #group by cluster
    groups = df.groupby('label')


    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9)) # set size
    ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

    #iterate through groups to layer the plot
    #note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
                label=cluster_names[name], color=cluster_colors[name], 
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(\
            axis= 'x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelbottom='off')
        ax.tick_params(\
            axis= 'y',         # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelleft='off')
    
    ax.legend(numpoints=1)  #show legend with only 1 point

    #add label in x,y position with the label as the film title
    for i in range(len(df)):
        ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
    plt.show() #show the plot
Number of clusters : 2
Sum of distances: 489 
Top terms per cluster:
Cluster 0:  cheese  pork  black  bean  beef
Cluster 1:  oil  peel  grass  leaf  palustre
Number of clusters : 3
Sum of distances: 486 
Top terms per cluster:
Cluster 0:  pork  boiled  sesame  lime  seed
Cluster 1:  oil  peel  grass  leaf  palustre
Cluster 2:  cheese  bean  black  beef  tea
Number of clusters : 4
Sum of distances: 481 
Top terms per cluster:
Cluster 0:  pork  black  bean  beef  tea
Cluster 1:  cheese  blue  dried  black  lobsterswiss
Cluster 2:  apple  calamuscooked  labdanumrose  brandycelery  oil
Cluster 3:  oil  peel  grass  leaf  palustre
Number of clusters : 5
Sum of distances: 478 
Top terms per cluster:
Cluster 0:  pork  bean  beef  black  tea
Cluster 1:  oil  peel  grass  leaf  palustre
Cluster 2:  sesame  seed  black  roasted  seedmyrtleberry
Cluster 3:  cabernet  sauvignon  grapemushroom  winefeijoa  grapechamomile
Cluster 4:  cheese  blue  dried  black  lobsterswiss
Number of clusters : 6
Sum of distances: 474 
Top terms per cluster:
Cluster 0:  cheese  bean  beef  bread  pepper
Cluster 1:  pork  smoked  belly  lemon  geraniumgrilled
Cluster 2:  fish  fatty  corn  lean  kohlrabismoked
Cluster 3:  grape  blanc  jonquilmuscadine  vineorange  cloudberrymuscadine
Cluster 4:  black  sesame  cheese  raspberrycapsicum  raspberrycabbage
Cluster 5:  oil  peel  grass  leaf  palustre
Number of clusters : 7
Sum of distances: 471 
Top terms per cluster:
Cluster 0:  cheese  blue  dried  black  cucumberparmesan
Cluster 1:  bean  black  tea  fish  bread
Cluster 2:  chinese  quincepopcorn  quincecinnamon  rum  quincejamaican
Cluster 3:  oil  peel  grass  leaf  palustre
Cluster 4:  pork  smoked  belly  lemon  sausagesherry
Cluster 5:  beef  niraraw  caucasroasted  baconraw  brothgrapefruit
Cluster 6:  lime  callitrismexican  chickenkaffir  juiceraw  citriodoraitalian
Number of clusters : 8
Sum of distances: 470 
Top terms per cluster:
Cluster 0:  bean  beef  bread  pepper  fish
Cluster 1:  cheese  blue  dried  black  cucumberparmesan
Cluster 2:  wine  carnationwhite  cabbagerose  cajeputwhite  mangosherry
Cluster 3:  emmental  chalepensis  cheesefilbert  cocoaruta  oil
Cluster 4:  tea  oilrooibus  corn  imperatoriaseychelles  cantaloupeceylon
Cluster 5:  oil  black  peel  grass  leaf
Cluster 6:  pork  smoked  belly  lemon  sausagesherry
Cluster 7:  juice  fruit  daffodilorange  beerflorida  naranjillapassion
Number of clusters : 9
Sum of distances: 463 
Top terms per cluster:
Cluster 0:  black  sesame  seed  cheese  raspberrycapsicum
Cluster 1:  oil  grass  leaf  palustre  chalepensis
Cluster 2:  cheese  blue  filbertswiss  berryparmesan  imperatoriaswiss
Cluster 3:  bean  guavaraw  chayotemung  pepperred  licoricesnap
Cluster 4:  wine  chinese  cajeputwhite  mangosherry  cabbagerose
Cluster 5:  pork  smoked  belly  lemon  geraniumgrilled
Cluster 6:  chicken  goat  coffeefried  breadraw  cheeseroasted
Cluster 7:  beef  bread  pepper  fish  tea
Cluster 8:  peel  lemon  oil  mandarin  cognacmandarin
Number of clusters : 10
Sum of distances: 463 
Top terms per cluster:
Cluster 0:  cheese  bread  grape  boiled  wine
Cluster 1:  pork  smoked  belly  lemon  sausagesherry
Cluster 2:  fish  corn  mint  fatty  lean
Cluster 3:  tea  oilrooibus  corn  imperatoriaseychelles  cantaloupeceylon
Cluster 4:  bean  black  sesame  cheese  raspberrycapsicum
Cluster 5:  fried  potato  porklicorice  pineappleraw  chickenmint
Cluster 6:  beef  niraraw  caucasroasted  baconraw  brothgrapefruit
Cluster 7:  peel  lemon  oil  mandarin  cognacmandarin
Cluster 8:  oil  grass  palustre  chalepensis  cooked
Cluster 9:  leaf  mozzarella  oil  cheeseturkey  cheeseorange
Number of clusters : 11
Sum of distances: 456 
Top terms per cluster:
Cluster 0:  chicken  coffeefried  breadraw  cheeseroasted  brothraw
Cluster 1:  pork  smoked  belly  lemon  sausagesherry
Cluster 2:  wine  carnationwhite  cabbagerose  cajeputwhite  mangosherry
Cluster 3:  oil  peel  grass  leaf  palustre
Cluster 4:  orange  cornmonkey  treesquid  artichokeisraeli  treepouching
Cluster 5:  beef  niraraw  caucasroasted  baconraw  brothgrapefruit
Cluster 6:  grape  blanc  vineorange  jonquilmuscadine  cloudberrymuscadine
Cluster 7:  cheese  blue  dried  black  cucumberparmesan
Cluster 8:  bread  codwheaten  soybeanwhite  katsuobushiwheat  cocoawhite
Cluster 9:  bean  guavaraw  chayotemung  pepperred  myrtleberrynavy
Cluster 10:  black  fish  pepper  tea  roasted
Number of clusters : 12
Sum of distances: 455 
Top terms per cluster:
Cluster 0:  pork  black  tea  boiled  orange
Cluster 1:  fish  leaf  fatty  corn  oil
Cluster 2:  oil  peel  grass  palustre  chalepensis
Cluster 3:  grape  blanc  vineorange  jonquilmuscadine  cloudberrymuscadine
Cluster 4:  bean  guavaraw  chayotemung  pepperred  lambnavy
Cluster 5:  pepper  cajeputcalifornia  acaciaethiopian  figmalagueta  breadethiopian
Cluster 6:  cheese  blue  dried  black  lobsterswiss
Cluster 7:  wine  carnationwhite  cabbagerose  cajeputwhite  mangosherry
Cluster 8:  lime  callitrismexican  chickenkaffir  juiceraw  citriodoraitalian
Cluster 9:  roasted  sesame  shrimptomato  filbertturmeric  pecanwalnut
Cluster 10:  bread  flower  codwheaten  cocoawhite  peppermintrye
Cluster 11:  beef  niraraw  caucasroasted  baconraw  brothgrapefruit
Number of clusters : 13
Sum of distances: 452 
Top terms per cluster:
Cluster 0:  pork  grape  roasted  fish  tea
Cluster 1:  beef  niraraw  caucasroasted  baconraw  brothgrapefruit
Cluster 2:  oil  peel  grass  leaf  palustre
Cluster 3:  pepper  cajeputcalifornia  acaciaethiopian  figmalagueta  breadethiopian
Cluster 4:  boiled  orange  chickenlamb  beefpeppermint  artichokeisraeli
Cluster 5:  wine  carnationwhite  cabbagerose  cajeputwhite  mangosherry
Cluster 6:  apple  calamuscooked  labdanumrose  brandycelery  oil
Cluster 7:  cheese  blue  lobsterswiss  carawaysheep  filbertswiss
Cluster 8:  raw  peanutvervain  beefwhale  radishuncured  lambroasted
Cluster 9:  bean  guavaraw  chayotemung  pepperred  lambnavy
Cluster 10:  green  bell  dried  tearose  teaparsnip
Cluster 11:  bread  codwheaten  soybeanwhite  katsuobushiwheat  cocoawhite
Cluster 12:  black  cheese  raspberrycapsicum  currantprune  seabreamonion
Number of clusters : 14
Sum of distances: 447 
Top terms per cluster:
Cluster 0:  beef  roasted  wine  fish  juice
Cluster 1:  cheese  corn  mint  fish  blue
Cluster 2:  oil  grass  leaf  palustre  chalepensis
Cluster 3:  black  sesame  cheese  raspberrycabbage  currantprune
Cluster 4:  pork  smoked  belly  lemon  rumsmoked
Cluster 5:  boiled  beefpeppermint  chickenlamb  creamuncured  porkcalifornia
Cluster 6:  grape  blanc  vineorange  jonquilmuscadine  cloudberrymuscadine
Cluster 7:  peel  lemon  oil  mandarin  cognacmandarin
Cluster 8:  bread  codwheaten  soybeanwhite  katsuobushiwheat  cocoawhite
Cluster 9:  bean  mangosteenraw  lambnavy  myrtleberrynavy  licoricesnap
Cluster 10:  apple  calamuscooked  labdanumrose  brandycelery  oil
Cluster 11:  pepper  cajeputcalifornia  acaciaethiopian  figmalagueta  breadethiopian
Cluster 12:  tea  oilrooibus  corn  imperatoriaseychelles  cantaloupeceylon
Cluster 13:  chicken  coffeefried  breadraw  cheeseroasted  brothraw
In [199]:
plt.plot(ks, distance,'ro')  #plot the distance over the number of samples
plt.show()  #show the plot

The best model is 14 with the minimum of total distances of 447

In [202]:
for true_k in range(14,15):
    vectorizer = TfidfVectorizer(order_centroids, stop_words='english')
    X = vectorizer.fit_transform(documents)

    model = KMeans(n_clusters=true_k)
    model.fit(X)
    
    print ("Number of clusters : %d" % true_k)
    print ("Sum of distances: %d " % model.inertia_)
    
    ks.append(true_k)
    distance.append(model.inertia_)
    
    print("Top terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print "Cluster %d:" % i,
        for ind in order_centroids[i, :5]:
            print ' %s' % terms[ind],
        print
    
    dist = 1 - cosine_similarity(X)

    MDS()

    # convert two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

    pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

    xs, ys = pos[:, 0], pos[:, 1]


    #set up colors per clusters using a dict
    cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a',4: '#66a61e', 5: '#E24A33',6:'#C4AD66',
                     7:'#fa8174', 8:'#E8000B', 9: '#B0E0E6', 10: '#7A68A6', 11: '#ccebc4', 12: '#4878CF',13:'#03ED3A'}

    #set up cluster names using a dict
    cluster_names = {0: 'cluster 1', 
                 1: 'cluster 2', 
                 2: 'cluster 3', 
                 3: 'cluster 4',
                 4: 'cluster 5',
                5: 'cluster 6',
                6:'cluster 7',
                    7: 'cluster 8',
                    8: 'cluster 9',
                    9: 'cluster 10',
                    10: 'cluster 11',
                    11: 'cluster 12',
                    12: 'cluster 13',
                    13: 'cluster 14'}


    clusters = model.labels_.tolist()

    #create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=documents)) 

    #create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=documents)) 


    #group by cluster
    groups = df.groupby('label')


    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9)) # set size
    ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

    #iterate through groups to layer the plot
    #note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
                label=cluster_names[name], color=cluster_colors[name], 
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(\
            axis= 'x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelbottom='off')
        ax.tick_params(\
            axis= 'y',         # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelleft='off')
    
    ax.legend(numpoints=1)  #show legend with only 1 point

    #add label in x,y position with the label as the film title
    for i in range(len(df)):
        ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
    plt.show() #show the plot
Number of clusters : 14
Sum of distances: 448 
Top terms per cluster:
Cluster 0:  boiled  berry  lime  pea  ashanti
Cluster 1:  oil  grass  leaf  palustre  chalepensis
Cluster 2:  beef  cherryfried  caucasroasted  baconraw  niraraw
Cluster 3:  pepper  tea  fish  apple  corn
Cluster 4:  cheese  blue  dried  black  cucumberparmesan
Cluster 5:  fruit  lamb  grilled  juice  teasapodilla
Cluster 6:  grape  blanc  jonquilmuscadine  vineorange  cloudberrymuscadine
Cluster 7:  wine  cajeputwhite  babacosparkling  carnationwhite  mangosherry
Cluster 8:  black  sesame  seed  raspberrycapsicum  seabreamonion
Cluster 9:  bean  guavaraw  lambnavy  pepperred  myrtleberrynavy
Cluster 10:  roasted  sesame  shrimptomato  filbertturmeric  pecanwalnut
Cluster 11:  peel  oil  mandarin  lemon  cognacmandarin
Cluster 12:  pork  smoked  belly  lemon  sausagesherry
Cluster 13:  bread  flower  codwheaten  cocoawhite  peppermintrye